{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import pandas as pd \n",
    "import sys\n",
    "sys.path.append('/ref/analysis/pipelines/')\n",
    "import kang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# new gene annotation Interpro\n",
    "# Collect the annotation rows of every id listed in the tmap .new table and save them as TSV.\n",
    "file_annot = 'pep.fa.cdhit.nostar.tsv'\n",
    "df_annot   = pd.read_csv(file_annot,sep='\\t',header=None)\n",
    "df_annot_ix = df_annot.set_index(0)\n",
    "\n",
    "file_new = '/ref/analysis/stringtie.addcds/cuffcmp.my_csv.csv.addgene.gff3.sort.gff3.tmap.new'\n",
    "df_new   = pd.read_csv(file_new,sep='\\t',header=None)\n",
    "\n",
    "# ids from column 4 are matched against the annotation index (column 0 of the annotation table)\n",
    "genenames = set(df_new[4].values)\n",
    "\n",
    "# Select every matching row in one step instead of growing a frame with DataFrame.append in a loop.\n",
    "# The loop was quadratic, relied on an API removed in pandas 2.0, turned df into a Series when the\n",
    "# first matched id had a single annotation row, and left df undefined when nothing matched.\n",
    "# Ids without annotation are simply absent from the result; rows keep the annotation file order,\n",
    "# so the output no longer depends on set iteration order.\n",
    "df = df_annot_ix[df_annot_ix.index.isin(genenames)]\n",
    "\n",
    "# written to the working directory as <basename of file_new>.annot, index first, no header row\n",
    "df.to_csv(file_new.split('/')[-1]+'.annot',sep='\\t',header=False)\n",
    "# new gene annotation end "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# new gene protein seq ret \n",
    "# Look up the sequence of every id in column 4 and write header/sequence pairs to new_gene.fa.\n",
    "file_fa       = 'pep.fa'\n",
    "dicfa         = kang.Fasta2dic(file_fa)\n",
    "# a missing id raises KeyError here, exactly as before (no silent gaps in the output)\n",
    "df_new['seq'] = df_new[4].apply(lambda gene_id : dicfa[gene_id])\n",
    "\n",
    "with open('new_gene.fa','w') as f:\n",
    "    # Walk the two columns directly; the previous df_new.loc[ix][col] form built a whole\n",
    "    # row Series twice per record just to read one value from it.\n",
    "    # Column 11 supplies the header text, the seq column the record body.\n",
    "    for hd, seq in zip(df_new[11], df_new['seq']):\n",
    "        print('>'+hd,file=f)\n",
    "        print(seq,file=f)\n",
    "# new gene protein seq ret end "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>annotation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>STRG.28.2|m.13357</td>\n",
       "      <td>sp|P16423|POLR_DROME</td>\n",
       "      <td>25.61</td>\n",
       "      <td>453</td>\n",
       "      <td>284</td>\n",
       "      <td>12</td>\n",
       "      <td>1238</td>\n",
       "      <td>1685</td>\n",
       "      <td>354</td>\n",
       "      <td>758</td>\n",
       "      <td>4.000000e-22</td>\n",
       "      <td>108.0</td>\n",
       "      <td>Retrovirus-related Pol polyprotein from type-2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>STRG.28.2|m.13357</td>\n",
       "      <td>sp|P16423|POLR_DROME</td>\n",
       "      <td>25.61</td>\n",
       "      <td>453</td>\n",
       "      <td>284</td>\n",
       "      <td>12</td>\n",
       "      <td>1238</td>\n",
       "      <td>1685</td>\n",
       "      <td>354</td>\n",
       "      <td>758</td>\n",
       "      <td>4.000000e-22</td>\n",
       "      <td>108.0</td>\n",
       "      <td>Retrovirus-related Pol polyprotein from type-2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>STRG.184.1|m.26632</td>\n",
       "      <td>sp|Q9VMJ7|KDM5_DROME</td>\n",
       "      <td>39.53</td>\n",
       "      <td>86</td>\n",
       "      <td>46</td>\n",
       "      <td>2</td>\n",
       "      <td>191</td>\n",
       "      <td>270</td>\n",
       "      <td>445</td>\n",
       "      <td>530</td>\n",
       "      <td>6.000000e-12</td>\n",
       "      <td>72.0</td>\n",
       "      <td>Lysine-specific demethylase lid OS=Drosophila ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>STRG.185.1|m.15264</td>\n",
       "      <td>sp|Q66H30|GIN1_RAT</td>\n",
       "      <td>28.14</td>\n",
       "      <td>199</td>\n",
       "      <td>135</td>\n",
       "      <td>5</td>\n",
       "      <td>37</td>\n",
       "      <td>230</td>\n",
       "      <td>15</td>\n",
       "      <td>210</td>\n",
       "      <td>2.000000e-16</td>\n",
       "      <td>81.6</td>\n",
       "      <td>Gypsy retrotransposon integrase-like protein 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>STRG.261.1|m.2566</td>\n",
       "      <td>sp|O04279|GPA2_PEA</td>\n",
       "      <td>27.91</td>\n",
       "      <td>86</td>\n",
       "      <td>53</td>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "      <td>114</td>\n",
       "      <td>146</td>\n",
       "      <td>231</td>\n",
       "      <td>6.000000e-03</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Guanine nucleotide-binding protein alpha-2 sub...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>STRG.257.1|m.16086</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>26.59</td>\n",
       "      <td>173</td>\n",
       "      <td>107</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>174</td>\n",
       "      <td>550</td>\n",
       "      <td>704</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>56.6</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>STRG.467.2</td>\n",
       "      <td>sp|Q5XJW2|G45IP_RAT</td>\n",
       "      <td>38.10</td>\n",
       "      <td>42</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>69</td>\n",
       "      <td>110</td>\n",
       "      <td>107</td>\n",
       "      <td>148</td>\n",
       "      <td>7.000000e-03</td>\n",
       "      <td>38.5</td>\n",
       "      <td>Growth arrest and DNA damage-inducible protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>STRG.559.1</td>\n",
       "      <td>sp|Q91YT2|RN185_MOUSE</td>\n",
       "      <td>35.09</td>\n",
       "      <td>57</td>\n",
       "      <td>33</td>\n",
       "      <td>2</td>\n",
       "      <td>135</td>\n",
       "      <td>188</td>\n",
       "      <td>37</td>\n",
       "      <td>92</td>\n",
       "      <td>1.000000e-08</td>\n",
       "      <td>55.5</td>\n",
       "      <td>E3 ubiquitin-protein ligase RNF185 OS=Mus musc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>STRG.807.7|m.2445</td>\n",
       "      <td>sp|Q9FNQ1|DEXHE_ARATH</td>\n",
       "      <td>47.20</td>\n",
       "      <td>1125</td>\n",
       "      <td>480</td>\n",
       "      <td>18</td>\n",
       "      <td>91</td>\n",
       "      <td>1211</td>\n",
       "      <td>115</td>\n",
       "      <td>1129</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>959.0</td>\n",
       "      <td>DExH-box ATP-dependent RNA helicase DExH14 OS=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>STRG.807.7|m.2445</td>\n",
       "      <td>sp|Q9FNQ1|DEXHE_ARATH</td>\n",
       "      <td>35.08</td>\n",
       "      <td>630</td>\n",
       "      <td>388</td>\n",
       "      <td>8</td>\n",
       "      <td>581</td>\n",
       "      <td>1209</td>\n",
       "      <td>1349</td>\n",
       "      <td>1958</td>\n",
       "      <td>4.000000e-97</td>\n",
       "      <td>347.0</td>\n",
       "      <td>DExH-box ATP-dependent RNA helicase DExH14 OS=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>STRG.1031.2</td>\n",
       "      <td>sp|P34152|FAK1_MOUSE</td>\n",
       "      <td>33.82</td>\n",
       "      <td>207</td>\n",
       "      <td>110</td>\n",
       "      <td>11</td>\n",
       "      <td>1373</td>\n",
       "      <td>1572</td>\n",
       "      <td>598</td>\n",
       "      <td>784</td>\n",
       "      <td>1.000000e-09</td>\n",
       "      <td>67.0</td>\n",
       "      <td>Focal adhesion kinase 1 OS=Mus musculus GN=Ptk...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>STRG.1131.1</td>\n",
       "      <td>sp|O77374|PF07_PLAF7</td>\n",
       "      <td>31.94</td>\n",
       "      <td>72</td>\n",
       "      <td>43</td>\n",
       "      <td>3</td>\n",
       "      <td>87</td>\n",
       "      <td>156</td>\n",
       "      <td>847</td>\n",
       "      <td>914</td>\n",
       "      <td>5.000000e-03</td>\n",
       "      <td>40.8</td>\n",
       "      <td>Uncharacterized protein PFC0810c OS=Plasmodium...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>STRG.1131.1</td>\n",
       "      <td>sp|O77374|PF07_PLAF7</td>\n",
       "      <td>31.94</td>\n",
       "      <td>72</td>\n",
       "      <td>43</td>\n",
       "      <td>3</td>\n",
       "      <td>87</td>\n",
       "      <td>156</td>\n",
       "      <td>847</td>\n",
       "      <td>914</td>\n",
       "      <td>5.000000e-03</td>\n",
       "      <td>40.8</td>\n",
       "      <td>Uncharacterized protein PFC0810c OS=Plasmodium...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>STRG.12019.1|m.36072</td>\n",
       "      <td>sp|Q3U4G0|CO041_MOUSE</td>\n",
       "      <td>34.80</td>\n",
       "      <td>273</td>\n",
       "      <td>137</td>\n",
       "      <td>9</td>\n",
       "      <td>52</td>\n",
       "      <td>307</td>\n",
       "      <td>28</td>\n",
       "      <td>276</td>\n",
       "      <td>1.000000e-44</td>\n",
       "      <td>162.0</td>\n",
       "      <td>Uncharacterized protein C15orf41 homolog OS=Mu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>STRG.12054.1|m.50679</td>\n",
       "      <td>sp|O96185|YPF08_PLAF7</td>\n",
       "      <td>36.54</td>\n",
       "      <td>104</td>\n",
       "      <td>53</td>\n",
       "      <td>10</td>\n",
       "      <td>54</td>\n",
       "      <td>145</td>\n",
       "      <td>524</td>\n",
       "      <td>626</td>\n",
       "      <td>4.000000e-07</td>\n",
       "      <td>54.7</td>\n",
       "      <td>Uncharacterized protein PFB0460c OS=Plasmodium...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>STRG.12054.1|m.50679</td>\n",
       "      <td>sp|O96185|YPF08_PLAF7</td>\n",
       "      <td>36.00</td>\n",
       "      <td>75</td>\n",
       "      <td>38</td>\n",
       "      <td>7</td>\n",
       "      <td>76</td>\n",
       "      <td>144</td>\n",
       "      <td>519</td>\n",
       "      <td>589</td>\n",
       "      <td>4.000000e-03</td>\n",
       "      <td>42.4</td>\n",
       "      <td>Uncharacterized protein PFB0460c OS=Plasmodium...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>STRG.12109.1|m.31938</td>\n",
       "      <td>sp|P08548|LIN1_NYCCO</td>\n",
       "      <td>22.79</td>\n",
       "      <td>351</td>\n",
       "      <td>248</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>342</td>\n",
       "      <td>489</td>\n",
       "      <td>828</td>\n",
       "      <td>5.000000e-23</td>\n",
       "      <td>107.0</td>\n",
       "      <td>LINE-1 reverse transcriptase homolog OS=Nyctic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69</th>\n",
       "      <td>STRG.12117.1|m.31903</td>\n",
       "      <td>sp|P08548|LIN1_NYCCO</td>\n",
       "      <td>22.84</td>\n",
       "      <td>359</td>\n",
       "      <td>260</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>351</td>\n",
       "      <td>478</td>\n",
       "      <td>828</td>\n",
       "      <td>5.000000e-26</td>\n",
       "      <td>115.0</td>\n",
       "      <td>LINE-1 reverse transcriptase homolog OS=Nyctic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>STRG.12550.1|m.41810</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>30.26</td>\n",
       "      <td>228</td>\n",
       "      <td>152</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>225</td>\n",
       "      <td>515</td>\n",
       "      <td>739</td>\n",
       "      <td>1.000000e-21</td>\n",
       "      <td>99.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>STRG.12846.1|m.63519</td>\n",
       "      <td>sp|P16423|POLR_DROME</td>\n",
       "      <td>25.61</td>\n",
       "      <td>453</td>\n",
       "      <td>284</td>\n",
       "      <td>12</td>\n",
       "      <td>1186</td>\n",
       "      <td>1633</td>\n",
       "      <td>354</td>\n",
       "      <td>758</td>\n",
       "      <td>2.000000e-22</td>\n",
       "      <td>109.0</td>\n",
       "      <td>Retrovirus-related Pol polyprotein from type-2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>STRG.13248.1|m.59691</td>\n",
       "      <td>sp|Q9SPI9|PSBW_CHLRE</td>\n",
       "      <td>100.00</td>\n",
       "      <td>115</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>115</td>\n",
       "      <td>1</td>\n",
       "      <td>115</td>\n",
       "      <td>1.000000e-77</td>\n",
       "      <td>231.0</td>\n",
       "      <td>Photosystem II reaction center W protein, chlo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>STRG.13462.5</td>\n",
       "      <td>sp|Q6PBN4|CQ10X_DANRE</td>\n",
       "      <td>34.55</td>\n",
       "      <td>55</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>61</td>\n",
       "      <td>105</td>\n",
       "      <td>159</td>\n",
       "      <td>1.000000e-04</td>\n",
       "      <td>42.0</td>\n",
       "      <td>Coenzyme Q-binding protein COQ10 homolog, mito...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>STRG.13741.1|m.83638</td>\n",
       "      <td>sp|P11369|LORF2_MOUSE</td>\n",
       "      <td>32.14</td>\n",
       "      <td>84</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>89</td>\n",
       "      <td>606</td>\n",
       "      <td>689</td>\n",
       "      <td>2.000000e-09</td>\n",
       "      <td>58.2</td>\n",
       "      <td>LINE-1 retrotransposable element ORF2 protein ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>STRG.13853.1</td>\n",
       "      <td>sp|A2AMZ4|CQ089_MOUSE</td>\n",
       "      <td>32.20</td>\n",
       "      <td>59</td>\n",
       "      <td>29</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>64</td>\n",
       "      <td>15</td>\n",
       "      <td>65</td>\n",
       "      <td>6.000000e-03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>Uncharacterized protein C17orf89 homolog OS=Mu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>STRG.14659.1|m.80009</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>29.03</td>\n",
       "      <td>155</td>\n",
       "      <td>102</td>\n",
       "      <td>3</td>\n",
       "      <td>19</td>\n",
       "      <td>171</td>\n",
       "      <td>583</td>\n",
       "      <td>731</td>\n",
       "      <td>4.000000e-11</td>\n",
       "      <td>65.1</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>STRG.14661.1|m.84607</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>29.92</td>\n",
       "      <td>127</td>\n",
       "      <td>83</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>123</td>\n",
       "      <td>121</td>\n",
       "      <td>246</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>55.1</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>STRG.14686.1|m.77912</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>24.77</td>\n",
       "      <td>109</td>\n",
       "      <td>77</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>109</td>\n",
       "      <td>601</td>\n",
       "      <td>704</td>\n",
       "      <td>3.000000e-05</td>\n",
       "      <td>45.8</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>STRG.14687.1|m.92230</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>33.82</td>\n",
       "      <td>136</td>\n",
       "      <td>72</td>\n",
       "      <td>3</td>\n",
       "      <td>53</td>\n",
       "      <td>188</td>\n",
       "      <td>453</td>\n",
       "      <td>570</td>\n",
       "      <td>1.000000e-12</td>\n",
       "      <td>69.7</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>STRG.14732.1|m.72319</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>27.18</td>\n",
       "      <td>401</td>\n",
       "      <td>259</td>\n",
       "      <td>9</td>\n",
       "      <td>46</td>\n",
       "      <td>442</td>\n",
       "      <td>453</td>\n",
       "      <td>824</td>\n",
       "      <td>1.000000e-33</td>\n",
       "      <td>140.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>STRG.14732.1|m.72319</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>26.17</td>\n",
       "      <td>493</td>\n",
       "      <td>314</td>\n",
       "      <td>12</td>\n",
       "      <td>46</td>\n",
       "      <td>532</td>\n",
       "      <td>453</td>\n",
       "      <td>901</td>\n",
       "      <td>7.000000e-34</td>\n",
       "      <td>141.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>635</th>\n",
       "      <td>STRG.9714.1|m.338839</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.53</td>\n",
       "      <td>212</td>\n",
       "      <td>130</td>\n",
       "      <td>9</td>\n",
       "      <td>262</td>\n",
       "      <td>462</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>6.000000e-06</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>636</th>\n",
       "      <td>STRG.9714.1|m.338839</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.53</td>\n",
       "      <td>212</td>\n",
       "      <td>130</td>\n",
       "      <td>9</td>\n",
       "      <td>82</td>\n",
       "      <td>282</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>3.000000e-06</td>\n",
       "      <td>52.4</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>637</th>\n",
       "      <td>STRG.9719.1|m.341368</td>\n",
       "      <td>sp|P08548|LIN1_NYCCO</td>\n",
       "      <td>27.27</td>\n",
       "      <td>187</td>\n",
       "      <td>124</td>\n",
       "      <td>4</td>\n",
       "      <td>30</td>\n",
       "      <td>210</td>\n",
       "      <td>454</td>\n",
       "      <td>634</td>\n",
       "      <td>2.000000e-15</td>\n",
       "      <td>79.7</td>\n",
       "      <td>LINE-1 reverse transcriptase homolog OS=Nyctic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642</th>\n",
       "      <td>STRG.9734.1|m.339265</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>24.14</td>\n",
       "      <td>145</td>\n",
       "      <td>100</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>141</td>\n",
       "      <td>470</td>\n",
       "      <td>608</td>\n",
       "      <td>2.000000e-05</td>\n",
       "      <td>47.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>643</th>\n",
       "      <td>STRG.9766.1|m.333293</td>\n",
       "      <td>sp|O00370|LORF2_HUMAN</td>\n",
       "      <td>23.66</td>\n",
       "      <td>224</td>\n",
       "      <td>156</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>223</td>\n",
       "      <td>620</td>\n",
       "      <td>832</td>\n",
       "      <td>7.000000e-10</td>\n",
       "      <td>62.4</td>\n",
       "      <td>LINE-1 retrotransposable element ORF2 protein ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>647</th>\n",
       "      <td>STRG.9783.1|m.339739</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>33.72</td>\n",
       "      <td>86</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "      <td>29</td>\n",
       "      <td>114</td>\n",
       "      <td>464</td>\n",
       "      <td>549</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>55.8</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>652</th>\n",
       "      <td>STRG.9973.1</td>\n",
       "      <td>sp|Q9UR07|TF211_SCHPO</td>\n",
       "      <td>27.08</td>\n",
       "      <td>506</td>\n",
       "      <td>330</td>\n",
       "      <td>12</td>\n",
       "      <td>29</td>\n",
       "      <td>511</td>\n",
       "      <td>798</td>\n",
       "      <td>1287</td>\n",
       "      <td>6.000000e-52</td>\n",
       "      <td>197.0</td>\n",
       "      <td>Transposon Tf2-11 polyprotein OS=Schizosacchar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>654</th>\n",
       "      <td>STRG.9974.1|m.344312</td>\n",
       "      <td>sp|P11369|LORF2_MOUSE</td>\n",
       "      <td>29.77</td>\n",
       "      <td>131</td>\n",
       "      <td>84</td>\n",
       "      <td>4</td>\n",
       "      <td>457</td>\n",
       "      <td>582</td>\n",
       "      <td>466</td>\n",
       "      <td>593</td>\n",
       "      <td>2.000000e-06</td>\n",
       "      <td>54.7</td>\n",
       "      <td>LINE-1 retrotransposable element ORF2 protein ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>664</th>\n",
       "      <td>STRG.10332.1|m.348972</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.06</td>\n",
       "      <td>212</td>\n",
       "      <td>131</td>\n",
       "      <td>9</td>\n",
       "      <td>459</td>\n",
       "      <td>659</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>3.000000e-04</td>\n",
       "      <td>47.4</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>665</th>\n",
       "      <td>STRG.10332.1|m.348972</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.06</td>\n",
       "      <td>212</td>\n",
       "      <td>131</td>\n",
       "      <td>9</td>\n",
       "      <td>364</td>\n",
       "      <td>564</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>2.000000e-04</td>\n",
       "      <td>47.8</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>STRG.10736.1|m.347298</td>\n",
       "      <td>sp|O00370|LORF2_HUMAN</td>\n",
       "      <td>25.20</td>\n",
       "      <td>369</td>\n",
       "      <td>223</td>\n",
       "      <td>9</td>\n",
       "      <td>558</td>\n",
       "      <td>918</td>\n",
       "      <td>448</td>\n",
       "      <td>771</td>\n",
       "      <td>4.000000e-23</td>\n",
       "      <td>110.0</td>\n",
       "      <td>LINE-1 retrotransposable element ORF2 protein ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>685</th>\n",
       "      <td>STRG.10990.1|m.350822</td>\n",
       "      <td>sp|Q0WPT7|Y2104_ARATH</td>\n",
       "      <td>45.45</td>\n",
       "      <td>319</td>\n",
       "      <td>146</td>\n",
       "      <td>8</td>\n",
       "      <td>82</td>\n",
       "      <td>391</td>\n",
       "      <td>53</td>\n",
       "      <td>352</td>\n",
       "      <td>3.000000e-85</td>\n",
       "      <td>270.0</td>\n",
       "      <td>Uncharacterized methyltransferase At2g41040, c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>686</th>\n",
       "      <td>STRG.10990.1|m.350822</td>\n",
       "      <td>sp|Q0WPT7|Y2104_ARATH</td>\n",
       "      <td>44.24</td>\n",
       "      <td>321</td>\n",
       "      <td>149</td>\n",
       "      <td>9</td>\n",
       "      <td>82</td>\n",
       "      <td>393</td>\n",
       "      <td>53</td>\n",
       "      <td>352</td>\n",
       "      <td>5.000000e-81</td>\n",
       "      <td>259.0</td>\n",
       "      <td>Uncharacterized methyltransferase At2g41040, c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>687</th>\n",
       "      <td>STRG.10970.1|m.368049</td>\n",
       "      <td>sp|P0CT36|TF23_SCHPO</td>\n",
       "      <td>30.39</td>\n",
       "      <td>622</td>\n",
       "      <td>396</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>596</td>\n",
       "      <td>577</td>\n",
       "      <td>1188</td>\n",
       "      <td>2.000000e-85</td>\n",
       "      <td>298.0</td>\n",
       "      <td>Transposon Tf2-3 polyprotein OS=Schizosaccharo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>691</th>\n",
       "      <td>STRG.11177.1|m.361017</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>30.22</td>\n",
       "      <td>225</td>\n",
       "      <td>131</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>225</td>\n",
       "      <td>462</td>\n",
       "      <td>662</td>\n",
       "      <td>1.000000e-22</td>\n",
       "      <td>101.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>699</th>\n",
       "      <td>STRG.11195.1|m.361228</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>25.95</td>\n",
       "      <td>185</td>\n",
       "      <td>129</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>193</td>\n",
       "      <td>583</td>\n",
       "      <td>761</td>\n",
       "      <td>3.000000e-11</td>\n",
       "      <td>65.5</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>704</th>\n",
       "      <td>STRG.11206.1|m.365581</td>\n",
       "      <td>sp|O00370|LORF2_HUMAN</td>\n",
       "      <td>25.21</td>\n",
       "      <td>119</td>\n",
       "      <td>84</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>124</td>\n",
       "      <td>627</td>\n",
       "      <td>740</td>\n",
       "      <td>2.000000e-04</td>\n",
       "      <td>43.9</td>\n",
       "      <td>LINE-1 retrotransposable element ORF2 protein ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>705</th>\n",
       "      <td>STRG.11247.1|m.360734</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>26.27</td>\n",
       "      <td>354</td>\n",
       "      <td>235</td>\n",
       "      <td>8</td>\n",
       "      <td>14</td>\n",
       "      <td>363</td>\n",
       "      <td>493</td>\n",
       "      <td>824</td>\n",
       "      <td>4.000000e-27</td>\n",
       "      <td>119.0</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>712</th>\n",
       "      <td>STRG.11284.1|m.351559</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>21.32</td>\n",
       "      <td>258</td>\n",
       "      <td>177</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>254</td>\n",
       "      <td>666</td>\n",
       "      <td>901</td>\n",
       "      <td>1.000000e-03</td>\n",
       "      <td>43.9</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>721</th>\n",
       "      <td>STRG.11672.3|m.369374</td>\n",
       "      <td>sp|P08548|LIN1_NYCCO</td>\n",
       "      <td>22.20</td>\n",
       "      <td>446</td>\n",
       "      <td>329</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>447</td>\n",
       "      <td>389</td>\n",
       "      <td>825</td>\n",
       "      <td>9.000000e-36</td>\n",
       "      <td>149.0</td>\n",
       "      <td>LINE-1 reverse transcriptase homolog OS=Nyctic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>722</th>\n",
       "      <td>STRG.11672.3|m.369374</td>\n",
       "      <td>sp|P08548|LIN1_NYCCO</td>\n",
       "      <td>22.20</td>\n",
       "      <td>446</td>\n",
       "      <td>329</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>447</td>\n",
       "      <td>389</td>\n",
       "      <td>825</td>\n",
       "      <td>2.000000e-35</td>\n",
       "      <td>149.0</td>\n",
       "      <td>LINE-1 reverse transcriptase homolog OS=Nyctic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>731</th>\n",
       "      <td>STRG.20805.1|m.374316</td>\n",
       "      <td>sp|Q2KXY7|IF2_BORA1</td>\n",
       "      <td>29.14</td>\n",
       "      <td>175</td>\n",
       "      <td>102</td>\n",
       "      <td>6</td>\n",
       "      <td>166</td>\n",
       "      <td>326</td>\n",
       "      <td>125</td>\n",
       "      <td>291</td>\n",
       "      <td>4.000000e-05</td>\n",
       "      <td>49.3</td>\n",
       "      <td>Translation initiation factor IF-2 OS=Bordetel...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>734</th>\n",
       "      <td>STRG.20841.1|m.374260</td>\n",
       "      <td>sp|P23504|SPAP_STRMU</td>\n",
       "      <td>27.78</td>\n",
       "      <td>126</td>\n",
       "      <td>76</td>\n",
       "      <td>5</td>\n",
       "      <td>147</td>\n",
       "      <td>262</td>\n",
       "      <td>803</td>\n",
       "      <td>923</td>\n",
       "      <td>2.000000e-04</td>\n",
       "      <td>45.8</td>\n",
       "      <td>Cell surface antigen I/II OS=Streptococcus mut...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>736</th>\n",
       "      <td>STRG.20843.1|m.374174</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>34.17</td>\n",
       "      <td>120</td>\n",
       "      <td>65</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>119</td>\n",
       "      <td>506</td>\n",
       "      <td>613</td>\n",
       "      <td>2.000000e-10</td>\n",
       "      <td>61.6</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>751</th>\n",
       "      <td>STRG.20922.1|m.374953</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.77</td>\n",
       "      <td>218</td>\n",
       "      <td>122</td>\n",
       "      <td>10</td>\n",
       "      <td>31</td>\n",
       "      <td>231</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>9.000000e-06</td>\n",
       "      <td>50.8</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>757</th>\n",
       "      <td>STRG.20978.1|m.375390</td>\n",
       "      <td>sp|P14381|YTX2_XENLA</td>\n",
       "      <td>30.47</td>\n",
       "      <td>128</td>\n",
       "      <td>84</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>131</td>\n",
       "      <td>132</td>\n",
       "      <td>256</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>55.5</td>\n",
       "      <td>Transposon TX1 uncharacterized 149 kDa protein...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>773</th>\n",
       "      <td>STRG.21133.1|m.376131</td>\n",
       "      <td>sp|O70338|RNH1_MOUSE</td>\n",
       "      <td>24.53</td>\n",
       "      <td>212</td>\n",
       "      <td>130</td>\n",
       "      <td>9</td>\n",
       "      <td>322</td>\n",
       "      <td>522</td>\n",
       "      <td>82</td>\n",
       "      <td>274</td>\n",
       "      <td>2.000000e-05</td>\n",
       "      <td>50.8</td>\n",
       "      <td>Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>774</th>\n",
       "      <td>STRG.21120.1|m.376062</td>\n",
       "      <td>sp|Q95SX7|RTBS_DROME</td>\n",
       "      <td>27.42</td>\n",
       "      <td>124</td>\n",
       "      <td>89</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>130</td>\n",
       "      <td>512</td>\n",
       "      <td>634</td>\n",
       "      <td>5.000000e-04</td>\n",
       "      <td>42.0</td>\n",
       "      <td>Probable RNA-directed DNA polymerase from tran...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>783</th>\n",
       "      <td>STRG.21207.2|m.376540</td>\n",
       "      <td>sp|A0QUV5|Y2350_MYCS2</td>\n",
       "      <td>30.77</td>\n",
       "      <td>182</td>\n",
       "      <td>86</td>\n",
       "      <td>7</td>\n",
       "      <td>155</td>\n",
       "      <td>333</td>\n",
       "      <td>100</td>\n",
       "      <td>244</td>\n",
       "      <td>1.000000e-08</td>\n",
       "      <td>58.9</td>\n",
       "      <td>Probable S-adenosylmethionine-dependent methyl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>784</th>\n",
       "      <td>STRG.21207.2|m.376540</td>\n",
       "      <td>sp|A0QUV5|Y2350_MYCS2</td>\n",
       "      <td>30.77</td>\n",
       "      <td>182</td>\n",
       "      <td>86</td>\n",
       "      <td>7</td>\n",
       "      <td>155</td>\n",
       "      <td>333</td>\n",
       "      <td>100</td>\n",
       "      <td>244</td>\n",
       "      <td>1.000000e-08</td>\n",
       "      <td>58.9</td>\n",
       "      <td>Probable S-adenosylmethionine-dependent methyl...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>146 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         0                      1       2     3    4   5  \\\n",
       "0        STRG.28.2|m.13357   sp|P16423|POLR_DROME   25.61   453  284  12   \n",
       "1        STRG.28.2|m.13357   sp|P16423|POLR_DROME   25.61   453  284  12   \n",
       "6       STRG.184.1|m.26632   sp|Q9VMJ7|KDM5_DROME   39.53    86   46   2   \n",
       "7       STRG.185.1|m.15264     sp|Q66H30|GIN1_RAT   28.14   199  135   5   \n",
       "26       STRG.261.1|m.2566     sp|O04279|GPA2_PEA   27.91    86   53   1   \n",
       "27      STRG.257.1|m.16086   sp|P14381|YTX2_XENLA   26.59   173  107   4   \n",
       "28              STRG.467.2    sp|Q5XJW2|G45IP_RAT   38.10    42   26   0   \n",
       "34              STRG.559.1  sp|Q91YT2|RN185_MOUSE   35.09    57   33   2   \n",
       "36       STRG.807.7|m.2445  sp|Q9FNQ1|DEXHE_ARATH   47.20  1125  480  18   \n",
       "37       STRG.807.7|m.2445  sp|Q9FNQ1|DEXHE_ARATH   35.08   630  388   8   \n",
       "39             STRG.1031.2   sp|P34152|FAK1_MOUSE   33.82   207  110  11   \n",
       "46             STRG.1131.1   sp|O77374|PF07_PLAF7   31.94    72   43   3   \n",
       "47             STRG.1131.1   sp|O77374|PF07_PLAF7   31.94    72   43   3   \n",
       "60    STRG.12019.1|m.36072  sp|Q3U4G0|CO041_MOUSE   34.80   273  137   9   \n",
       "62    STRG.12054.1|m.50679  sp|O96185|YPF08_PLAF7   36.54   104   53  10   \n",
       "63    STRG.12054.1|m.50679  sp|O96185|YPF08_PLAF7   36.00    75   38   7   \n",
       "67    STRG.12109.1|m.31938   sp|P08548|LIN1_NYCCO   22.79   351  248  10   \n",
       "69    STRG.12117.1|m.31903   sp|P08548|LIN1_NYCCO   22.84   359  260   9   \n",
       "81    STRG.12550.1|m.41810   sp|P14381|YTX2_XENLA   30.26   228  152   3   \n",
       "88    STRG.12846.1|m.63519   sp|P16423|POLR_DROME   25.61   453  284  12   \n",
       "95    STRG.13248.1|m.59691   sp|Q9SPI9|PSBW_CHLRE  100.00   115    0   0   \n",
       "96            STRG.13462.5  sp|Q6PBN4|CQ10X_DANRE   34.55    55   36   0   \n",
       "112   STRG.13741.1|m.83638  sp|P11369|LORF2_MOUSE   32.14    84   57   0   \n",
       "115           STRG.13853.1  sp|A2AMZ4|CQ089_MOUSE   32.20    59   29   2   \n",
       "127   STRG.14659.1|m.80009   sp|P14381|YTX2_XENLA   29.03   155  102   3   \n",
       "128   STRG.14661.1|m.84607   sp|P14381|YTX2_XENLA   29.92   127   83   4   \n",
       "134   STRG.14686.1|m.77912   sp|P14381|YTX2_XENLA   24.77   109   77   1   \n",
       "135   STRG.14687.1|m.92230   sp|P14381|YTX2_XENLA   33.82   136   72   3   \n",
       "139   STRG.14732.1|m.72319   sp|P14381|YTX2_XENLA   27.18   401  259   9   \n",
       "140   STRG.14732.1|m.72319   sp|P14381|YTX2_XENLA   26.17   493  314  12   \n",
       "..                     ...                    ...     ...   ...  ...  ..   \n",
       "635   STRG.9714.1|m.338839   sp|O70338|RNH1_MOUSE   24.53   212  130   9   \n",
       "636   STRG.9714.1|m.338839   sp|O70338|RNH1_MOUSE   24.53   212  130   9   \n",
       "637   STRG.9719.1|m.341368   sp|P08548|LIN1_NYCCO   27.27   187  124   4   \n",
       "642   STRG.9734.1|m.339265   sp|P14381|YTX2_XENLA   24.14   145  100   2   \n",
       "643   STRG.9766.1|m.333293  sp|O00370|LORF2_HUMAN   23.66   224  156   4   \n",
       "647   STRG.9783.1|m.339739   sp|P14381|YTX2_XENLA   33.72    86   57   0   \n",
       "652            STRG.9973.1  sp|Q9UR07|TF211_SCHPO   27.08   506  330  12   \n",
       "654   STRG.9974.1|m.344312  sp|P11369|LORF2_MOUSE   29.77   131   84   4   \n",
       "664  STRG.10332.1|m.348972   sp|O70338|RNH1_MOUSE   24.06   212  131   9   \n",
       "665  STRG.10332.1|m.348972   sp|O70338|RNH1_MOUSE   24.06   212  131   9   \n",
       "681  STRG.10736.1|m.347298  sp|O00370|LORF2_HUMAN   25.20   369  223   9   \n",
       "685  STRG.10990.1|m.350822  sp|Q0WPT7|Y2104_ARATH   45.45   319  146   8   \n",
       "686  STRG.10990.1|m.350822  sp|Q0WPT7|Y2104_ARATH   44.24   321  149   9   \n",
       "687  STRG.10970.1|m.368049   sp|P0CT36|TF23_SCHPO   30.39   622  396  11   \n",
       "691  STRG.11177.1|m.361017   sp|P14381|YTX2_XENLA   30.22   225  131   3   \n",
       "699  STRG.11195.1|m.361228   sp|P14381|YTX2_XENLA   25.95   185  129   3   \n",
       "704  STRG.11206.1|m.365581  sp|O00370|LORF2_HUMAN   25.21   119   84   1   \n",
       "705  STRG.11247.1|m.360734   sp|P14381|YTX2_XENLA   26.27   354  235   8   \n",
       "712  STRG.11284.1|m.351559   sp|P14381|YTX2_XENLA   21.32   258  177   7   \n",
       "721  STRG.11672.3|m.369374   sp|P08548|LIN1_NYCCO   22.20   446  329  10   \n",
       "722  STRG.11672.3|m.369374   sp|P08548|LIN1_NYCCO   22.20   446  329  10   \n",
       "731  STRG.20805.1|m.374316    sp|Q2KXY7|IF2_BORA1   29.14   175  102   6   \n",
       "734  STRG.20841.1|m.374260   sp|P23504|SPAP_STRMU   27.78   126   76   5   \n",
       "736  STRG.20843.1|m.374174   sp|P14381|YTX2_XENLA   34.17   120   65   4   \n",
       "751  STRG.20922.1|m.374953   sp|O70338|RNH1_MOUSE   24.77   218  122  10   \n",
       "757  STRG.20978.1|m.375390   sp|P14381|YTX2_XENLA   30.47   128   84   4   \n",
       "773  STRG.21133.1|m.376131   sp|O70338|RNH1_MOUSE   24.53   212  130   9   \n",
       "774  STRG.21120.1|m.376062   sp|Q95SX7|RTBS_DROME   27.42   124   89   1   \n",
       "783  STRG.21207.2|m.376540  sp|A0QUV5|Y2350_MYCS2   30.77   182   86   7   \n",
       "784  STRG.21207.2|m.376540  sp|A0QUV5|Y2350_MYCS2   30.77   182   86   7   \n",
       "\n",
       "        6     7     8     9            10     11  \\\n",
       "0    1238  1685   354   758  4.000000e-22  108.0   \n",
       "1    1238  1685   354   758  4.000000e-22  108.0   \n",
       "6     191   270   445   530  6.000000e-12   72.0   \n",
       "7      37   230    15   210  2.000000e-16   81.6   \n",
       "26     38   114   146   231  6.000000e-03   40.0   \n",
       "27      4   174   550   704  3.000000e-08   56.6   \n",
       "28     69   110   107   148  7.000000e-03   38.5   \n",
       "34    135   188    37    92  1.000000e-08   55.5   \n",
       "36     91  1211   115  1129  0.000000e+00  959.0   \n",
       "37    581  1209  1349  1958  4.000000e-97  347.0   \n",
       "39   1373  1572   598   784  1.000000e-09   67.0   \n",
       "46     87   156   847   914  5.000000e-03   40.8   \n",
       "47     87   156   847   914  5.000000e-03   40.8   \n",
       "60     52   307    28   276  1.000000e-44  162.0   \n",
       "62     54   145   524   626  4.000000e-07   54.7   \n",
       "63     76   144   519   589  4.000000e-03   42.4   \n",
       "67      4   342   489   828  5.000000e-23  107.0   \n",
       "69      2   351   478   828  5.000000e-26  115.0   \n",
       "81      2   225   515   739  1.000000e-21   99.0   \n",
       "88   1186  1633   354   758  2.000000e-22  109.0   \n",
       "95      1   115     1   115  1.000000e-77  231.0   \n",
       "96      7    61   105   159  1.000000e-04   42.0   \n",
       "112     6    89   606   689  2.000000e-09   58.2   \n",
       "115     9    64    15    65  6.000000e-03   35.0   \n",
       "127    19   171   583   731  4.000000e-11   65.1   \n",
       "128     2   123   121   246  3.000000e-08   55.1   \n",
       "134     1   109   601   704  3.000000e-05   45.8   \n",
       "135    53   188   453   570  1.000000e-12   69.7   \n",
       "139    46   442   453   824  1.000000e-33  140.0   \n",
       "140    46   532   453   901  7.000000e-34  141.0   \n",
       "..    ...   ...   ...   ...           ...    ...   \n",
       "635   262   462    82   274  6.000000e-06   52.0   \n",
       "636    82   282    82   274  3.000000e-06   52.4   \n",
       "637    30   210   454   634  2.000000e-15   79.7   \n",
       "642     1   141   470   608  2.000000e-05   47.0   \n",
       "643     4   223   620   832  7.000000e-10   62.4   \n",
       "647    29   114   464   549  3.000000e-08   55.8   \n",
       "652    29   511   798  1287  6.000000e-52  197.0   \n",
       "654   457   582   466   593  2.000000e-06   54.7   \n",
       "664   459   659    82   274  3.000000e-04   47.4   \n",
       "665   364   564    82   274  2.000000e-04   47.8   \n",
       "681   558   918   448   771  4.000000e-23  110.0   \n",
       "685    82   391    53   352  3.000000e-85  270.0   \n",
       "686    82   393    53   352  5.000000e-81  259.0   \n",
       "687     2   596   577  1188  2.000000e-85  298.0   \n",
       "691     3   225   462   662  1.000000e-22  101.0   \n",
       "699    11   193   583   761  3.000000e-11   65.5   \n",
       "704     6   124   627   740  2.000000e-04   43.9   \n",
       "705    14   363   493   824  4.000000e-27  119.0   \n",
       "712     1   254   666   901  1.000000e-03   43.9   \n",
       "721    11   447   389   825  9.000000e-36  149.0   \n",
       "722    11   447   389   825  2.000000e-35  149.0   \n",
       "731   166   326   125   291  4.000000e-05   49.3   \n",
       "734   147   262   803   923  2.000000e-04   45.8   \n",
       "736     2   119   506   613  2.000000e-10   61.6   \n",
       "751    31   231    82   274  9.000000e-06   50.8   \n",
       "757     6   131   132   256  3.000000e-08   55.5   \n",
       "773   322   522    82   274  2.000000e-05   50.8   \n",
       "774     7   130   512   634  5.000000e-04   42.0   \n",
       "783   155   333   100   244  1.000000e-08   58.9   \n",
       "784   155   333   100   244  1.000000e-08   58.9   \n",
       "\n",
       "                                            annotation  \n",
       "0    Retrovirus-related Pol polyprotein from type-2...  \n",
       "1    Retrovirus-related Pol polyprotein from type-2...  \n",
       "6    Lysine-specific demethylase lid OS=Drosophila ...  \n",
       "7    Gypsy retrotransposon integrase-like protein 1...  \n",
       "26   Guanine nucleotide-binding protein alpha-2 sub...  \n",
       "27   Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "28   Growth arrest and DNA damage-inducible protein...  \n",
       "34   E3 ubiquitin-protein ligase RNF185 OS=Mus musc...  \n",
       "36   DExH-box ATP-dependent RNA helicase DExH14 OS=...  \n",
       "37   DExH-box ATP-dependent RNA helicase DExH14 OS=...  \n",
       "39   Focal adhesion kinase 1 OS=Mus musculus GN=Ptk...  \n",
       "46   Uncharacterized protein PFC0810c OS=Plasmodium...  \n",
       "47   Uncharacterized protein PFC0810c OS=Plasmodium...  \n",
       "60   Uncharacterized protein C15orf41 homolog OS=Mu...  \n",
       "62   Uncharacterized protein PFB0460c OS=Plasmodium...  \n",
       "63   Uncharacterized protein PFB0460c OS=Plasmodium...  \n",
       "67   LINE-1 reverse transcriptase homolog OS=Nyctic...  \n",
       "69   LINE-1 reverse transcriptase homolog OS=Nyctic...  \n",
       "81   Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "88   Retrovirus-related Pol polyprotein from type-2...  \n",
       "95   Photosystem II reaction center W protein, chlo...  \n",
       "96   Coenzyme Q-binding protein COQ10 homolog, mito...  \n",
       "112  LINE-1 retrotransposable element ORF2 protein ...  \n",
       "115  Uncharacterized protein C17orf89 homolog OS=Mu...  \n",
       "127  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "128  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "134  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "135  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "139  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "140  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "..                                                 ...  \n",
       "635  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "636  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "637  LINE-1 reverse transcriptase homolog OS=Nyctic...  \n",
       "642  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "643  LINE-1 retrotransposable element ORF2 protein ...  \n",
       "647  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "652  Transposon Tf2-11 polyprotein OS=Schizosacchar...  \n",
       "654  LINE-1 retrotransposable element ORF2 protein ...  \n",
       "664  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "665  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "681  LINE-1 retrotransposable element ORF2 protein ...  \n",
       "685  Uncharacterized methyltransferase At2g41040, c...  \n",
       "686  Uncharacterized methyltransferase At2g41040, c...  \n",
       "687  Transposon Tf2-3 polyprotein OS=Schizosaccharo...  \n",
       "691  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "699  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "704  LINE-1 retrotransposable element ORF2 protein ...  \n",
       "705  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "712  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "721  LINE-1 reverse transcriptase homolog OS=Nyctic...  \n",
       "722  LINE-1 reverse transcriptase homolog OS=Nyctic...  \n",
       "731  Translation initiation factor IF-2 OS=Bordetel...  \n",
       "734  Cell surface antigen I/II OS=Streptococcus mut...  \n",
       "736  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "751  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "757  Transposon TX1 uncharacterized 149 kDa protein...  \n",
       "773  Ribonuclease H1 OS=Mus musculus GN=Rnaseh1 PE=...  \n",
       "774  Probable RNA-directed DNA polymerase from tran...  \n",
       "783  Probable S-adenosylmethionine-dependent methyl...  \n",
       "784  Probable S-adenosylmethionine-dependent methyl...  \n",
       "\n",
       "[146 rows x 13 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# new protein annot with uniprot\n",
    "# Attach the UniProt description of every subject hit to the new-gene BLAST table.\n",
    "# Relies on `pd` and `kang` from the import cell at the top of the notebook.\n",
    "file_bp = 'new_gene.fa.bp.na1.uniprot-all.fasta.out7'\n",
    "# tab-separated, no header row; lines starting with '#' are skipped\n",
    "df_bp   = pd.read_csv(file_bp,sep='\\t',header=None,comment='#')\n",
    "\n",
    "file_uniprot = '/ref/analysis/References/uniprot/uniprot-all.fasta'\n",
    "# NOTE(review): presumably a dict keyed by the full FASTA header line - confirm in kang\n",
    "dic_uniprot  = kang.Fasta2dic_all(file_uniprot)\n",
    "\n",
    "# Split each header once: the first whitespace-separated token is the ID used in\n",
    "# column 1 of the hit table, the remaining tokens (re-joined with single spaces)\n",
    "# form the description. A later duplicate ID overwrites an earlier one.\n",
    "dic_uniprot_IDmap = {}\n",
    "for header in dic_uniprot.keys():\n",
    "    fields = header.split()\n",
    "    dic_uniprot_IDmap[fields[0]] = ' '.join(fields[1:])\n",
    "\n",
    "# Vectorised lookup; IDs that are not in the map come back as NaN.\n",
    "annotation  = df_bp[1].map(dic_uniprot_IDmap)\n",
    "missing_ids = df_bp[1][annotation.isnull()].unique()\n",
    "if len(missing_ids) > 0:\n",
    "    # Same exception type as a plain dict lookup, but it names the offending IDs\n",
    "    # (at most ten of them) instead of stopping at the first one.\n",
    "    raise KeyError('%d subject ID(s) not found in %s, e.g. %s'\n",
    "                   % (len(missing_ids), file_uniprot, ', '.join(map(str, missing_ids[:10]))))\n",
    "df_bp['annotation'] = annotation\n",
    "\n",
    "# preview as the last expression so that it is actually rendered\n",
    "df_bp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Export the rows whose column 10 value is below 0.1 to '<file_bp>.annot'\n",
    "# (tab-separated, index and header included).\n",
    "# NOTE(review): column 10 is presumably the BLAST e-value - confirm against the output format.\n",
    "df_bp.loc[df_bp[10] < 0.1].to_csv(\n",
    "    file_bp + '.annot',\n",
    "    sep='\\t',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
